1 Little Rock Child Maltreatment

1.1 Background

This is an exploratory analysis of a data set that reports child maltreatment incidents in Little Rock.

1.2 Map of Incidents

First we plot the incidents on a map as a point process, using the latitudes and longitudes obtained by geocoding the addresses with ggmap.

# Start from a clean workspace.  (NOTE(review): rm(list = ls()) in scripts is
# generally discouraged -- it clears objects only, not packages or options.)
rm(list = ls())
library(devtools)
# devtools::install_github("dkahle/ggmap", ref = "tidyup")
# devtools::install_github("thomasp85/patchwork")
library(pacman)
# Load (installing if necessary) every package used in this report.
# The duplicate "Hmisc" entry in the original list has been removed.
p_load("sf"             # Spatial data objects and methods
       ,"mapview"        # Interactive Map Viewing
       ,"ggmap"          # ggplot2 addon for base maps
       ,"cowplot" 
       ,"spatstat"       # KDE and other spatial functions
       ,"raster"         # cell-based spatial operations
       ,"tidyverse"      # data manipulation framework
       ,"Hmisc"          # using cut2(  functions for ggplot legends
       ,"fitdistrplus"   # Distribution fitting functions
       ,"lubridate"      # Power tools for handling dates
       ,"tidycensus" 
       ,"lwgeom" 
       ,"hrbrthemes" 
       ,"gridExtra" 
       ,"spdep"          # KNN functions
       ,"foreach" 
       ,"doParallel" 
       ,"corrplot" 
       ,"ranger"         # randomforest implimentation      
       ,"glmnet"         # for Ridge and Lasso Regression
       ,"knitr"          # for kable table
       ,"kableExtra" 
       ,"FNN"            # KNN for CPS vs. NN plots
       ,"groupdata2" 
       ,"htmltools" 
       ,"viridis" 
       ,"viridisLite")
# rm(list = ls())
# NOTE(review): setwd() makes the script non-portable; kept to preserve the
# original workflow, but consider relative paths or here::here().
setwd("~/Data/LR-Child")
# Raw accepted-maltreatment reports; the export encodes missing values as
# the literal string 'NULL'.
crime <- read.csv(file = "child-mt-accepted.csv", header = TRUE, sep = ",",
                  na.strings = 'NULL')

# Title-case the street and city names so the geocoder sees clean addresses.
crime$STR_NME <- crime$STR_NME %>% as.character %>% str_to_title 
crime$CTY_NME <- crime$CTY_NME %>% as.character %>% str_to_title

# Build one free-form address string per record for geocoding.
# BUG FIX: the original used attach(crime) *before* the title-casing above,
# so the bare column names referenced the stale, un-titled snapshot and the
# cleaning never reached the addresses.  with() evaluates against the
# current, cleaned data frame and keeps the search path unpolluted.
full_add <- with(crime, paste(STR_NBR, STR_NME, STR_SFX_TYP_DESC, ",",
                              CTY_NME, ",", "AR"))
head(full_add)
[1] "4511 LUDWIG  , LITTLE ROCK , AR"           
[2] "800 BEACH Street , NORTH LITTLE ROCK , AR" 
[3] "8208 WESTWOOD  Avenue , LITTLE ROCK , AR"  
[4] "6838 CANTRELL  Road , LITTLE ROCK , AR"    
[5] "400 COLLEGE PARK  , NORTH LITTLE ROCK , AR"
[6] "17809 LAWSON Road , LITTLE ROCK , AR"      
library(ggmap)
# A Google Maps API key is required for geocoding.
register_google(key = "<your key here>")
# BUG FIX: the original called geocode(crime$full_add), but full_add is a
# free-standing vector -- it was never assigned into `crime`, so
# crime$full_add was NULL.  Geocode the vector directly.
geocoded_latlon <- geocode(full_add, output = "latlon")
geocoded_latlon %>% head()
# Cache the geocoded result to disk so the slow, rate-limited geocoding step
# only runs once; later chunks read this file back in.
crime.gc <- cbind(crime, full_add, geocoded_latlon)
write.csv(crime.gc, "LR-child-mt-accepted-geocoded.csv")

1.3 Different City Names

setwd("~/Data/LR-Child")
# Read back the cached geocoded data written by the previous chunk.
crime.gc.read <- read.csv(file = "LR-child-mt-accepted-geocoded.csv",
                          header = TRUE, sep = ",")

# Count the distinct city-name spellings present in the raw data.
tab <- as.data.frame(table(crime.gc.read$CTY_NME))
cat("Number of different city names", nrow(tab))
Number of different city names 207
# Keep only city names with more than 50 occurrences, then plot their counts
# in descending order.  geom_col() is the idiomatic replacement for
# geom_bar(stat = "identity").
tab <- tab %>% filter(Freq > 50)

ggplot(tab, aes(x = reorder(Var1, -Freq), y = Freq)) +
  geom_col() +
  xlab("City Names") + ylab("Count") + coord_flip() +
  theme(axis.text = element_text(size = 10))

It appears there are many different city names in this data – what do they represent?

For this report, we filter the city names by “Little Rock” and use Little Rock boundaries to filter out addresses that are outside LR.

# Restrict to records geocoded within Little Rock proper: match the city name
# and clip to a lat/lon bounding box to drop obviously bad geocodes.
# A single filter() with comma-separated conditions is equivalent to the
# original chain of three filter() calls (conditions are AND-ed).
crime.gc.lr <- crime.gc.read %>%
  filter(CTY_NME == "Little Rock",
         lat >= 34.62606, lat <= 34.82195,
         lon >= -92.52091, lon <= -92.15494)

nrow(crime.gc.lr)
[1] 5136
# Quick point map of the filtered incidents.
qmplot(lon, lat, data = crime.gc.lr)

# Point map with a 2D kernel-density ("propensity") overlay.
# after_stat(level) replaces the deprecated ..level.. notation.
qmplot(lon, lat, data = crime.gc.lr, geom = "point", maptype = "toner-lite",
       darken = .5, color = I("red"), legend = "topleft") +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon",
                  alpha = .5, color = NA) +
  scale_fill_gradient2("Propensity", low = "white", mid = "yellow",
                       high = "red", midpoint = 60)

1.4 Dates of Incidents

library(lubridate)
# str(crime.gc)
# Some records have a blank Incident.Date; fall back to the Referral.Date.
crime.gc.lr$Incident.Date <- ifelse(as.character(crime.gc.lr$Incident.Date) == "",
                                    as.character(crime.gc.lr$Referral.Date),
                                    as.character(crime.gc.lr$Incident.Date))

# Parse the m/d/Y strings into Date objects, then derive year and week.
# year()/week() accept Date input directly -- the original's extra ymd()
# wrapper re-parsed the already-parsed dates, which was redundant.
crime.gc.lr$Incident.Date <- as.Date(crime.gc.lr$Incident.Date, format = "%m/%d/%Y")
crime.gc.lr$IncidentYear <- lubridate::year(crime.gc.lr$Incident.Date)
crime.gc.lr$IncidentWeek <- lubridate::week(crime.gc.lr$Incident.Date)


library(dplyr)

# Work only with incidents from 2015 onward; hoist the shared filter so it
# is applied once rather than in each pipeline.
crime.recent <- crime.gc.lr %>% filter(IncidentYear >= 2015)

# Incident counts per year.
crime.yearly.ct <- crime.recent %>%
  group_by(IncidentYear) %>%
  dplyr::summarise(count = n())

# Incident counts per (year, week) pair.
crime.weekly.ct <- crime.recent %>%
  group_by(IncidentYear, IncidentWeek) %>%
  dplyr::summarise(count = n())

1.4.1 Weekly Counts

The following figure shows the weekly incident counts by year; note that the 2018 data stops in June, the same month in which the 2015 data begins.

library(ggplot2)
# Weekly incident counts, one panel per year (free y-scales keep low-count
# years readable).  geom_col() replaces geom_bar(stat = "identity"), and the
# redundant stat = "identity" on geom_line() (its default) is dropped.
(crime.plot <- ggplot(crime.weekly.ct,
                      aes(x = IncidentWeek, y = count,
                          group = as.factor(IncidentYear),
                          colour = as.factor(IncidentYear))) +
  geom_line() +
  geom_col(alpha = 0.2) +
  ylab("Incident Count") +
  xlab("Weeks") +
  facet_grid(IncidentYear ~ ., scales = "free_y") +
  theme(legend.position = "right"))

1.4.2 Yearly Counts

# Total incident counts per year.  geom_col() replaces the equivalent
# geom_bar(stat = "identity").
(crime.plot <- ggplot(crime.yearly.ct, aes(x = IncidentYear, y = count)) +
  geom_col() +
  ylab("Incident Count") +
  xlab("Years") +
  labs(title = "Yearly Counts") +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black")) +
  theme(legend.position = "right"))

2 Animated Map

This animation shows how the incidents moved over time - the animation shows the points on the map of LR over years from 2015 to 2018.

library(gganimate)
# Per-location, per-year incident counts from 2015 onward; point size will
# encode the count at each (lon, lat).
yearly.pts <- crime.gc.lr %>%
  filter(IncidentYear >= 2015) %>%
  group_by(lon, lat, IncidentYear) %>%
  dplyr::summarise(count = n())

# Static toner-lite base map with the raw points, overlaid with a purple
# size-scaled layer that is animated over IncidentYear (one frame per year;
# {frame_time} in the title shows the current year).
anim.map <- qmplot(lon, lat, data = yearly.pts, maptype = "toner-lite",
                   color = I("red")) +
  geom_point(aes(x = lon, y = lat, size = count),
             data = yearly.pts, colour = 'purple', alpha = .7) +
  labs(title = 'Year: {frame_time}', x = 'lon', y = 'lat') +
  transition_time(IncidentYear)

animate(anim.map, fps = 1, nframes = 4)

3 Frequent Itemset Mining

Frequent itemset mining is a form of frequent pattern mining, with applications including market basket analysis, click-stream analysis, and web link analysis.

3.1 Market Basket Analysis (Motivation)

  • Building blocks problem shares some similarity with market basket analysis:

    1. mutual exclusivity
    2. sparsity
  • Mutual exclusivity: it is redundant for a single customer to buy multiple items of the same type, yet a majority of the general population will buy something from that category — e.g. one customer is not likely to buy both Coke and Pepsi, but most soda drinkers buy some brown soda — so {Coke, Pepsi} is the building block for the soda requirement.

3.2 Notations

Formally, our goal here is to identify “building blocks” for child maltreatment from the binary incidence variables, observed as an \(m \times n\) sparse binary matrix, where \(m,n\) denote the number of observations and the number of different allegations/events respectively. The idea here is to treat this data as transaction data, where each sample represents a transaction and each cause an item. In the language of ‘itemset mining’, we have an item set \(I\) and a transaction set \(D\), and each transaction in \(D\) contains a subset of the items in \(I\). In our case, each subject might have 1’s on a subset of the list of columns.

First, we define some preliminaries: a rule \(X \Rightarrow Y\) implies \(X, Y \subseteq I\) and \(X \cap Y = \emptyset\), where \(X\) is called the LHS or ‘antecedent’, and \(Y\) is called the RHS or ‘consequent’. The support \(supp(X)\) of an itemset (think, cause-set) \(X\) is defined as the proportion of transactions (think, samples) in the data set which contain the itemset. The confidence of a rule is defined as \(conf(X \Rightarrow Y ) = supp(X \cup Y )/supp(X)\), and mimics the conditional probability \(P(Y | X)\). A third interest measure, lift, is defined as \(lift(X \Rightarrow Y) = supp(X \cup Y )/(supp(X)\times supp(Y))\), which gives the deviation of the support of the rule \(X \Rightarrow Y\) from the support expected under independence — i.e. higher lift means stronger association.

An association rule is a rule that surpasses a user-specified minimum support and minimum confidence threshold; to select the interesting association rules we further filter (or rank) them by an additional interest measure — e.g. lift. There are other measures of interest, such as the chi-square measure, conviction and leverage, but we will skip them for the sake of simplicity.

3.2.1 R packages

Now, we use two R-packages arules and arulesViz to find a few “interesting” association rules (building blocks) and plot them. We first load the packages and create the data-sets from the raw data.

3.2.2 Item Frequency Plot

To see which items are important in the data set we can use the itemFrequencyPlot. To reduce the number of items, we only plot the item frequency for items with a support greater than 5% (using the parameter support). For better readability of the labels, we reduce the label size with the parameter cex.names.

library(arules)
library(arulesViz)
# Columns 10:130 hold the binary allegation/event indicators; coerce that
# slice to a matrix, then to an arules 'transactions' object.
alleg.matrix <- as.matrix(crime.gc.lr[, 10:130])
crime.lr <- as(alleg.matrix, "transactions")
# Plot only items appearing in at least 5% of the transactions.
itemFrequencyPlot(crime.lr, support = 0.05)

3.3 Association rules using the Apriori algorithm

Next, we call the function apriori() to find all rules (the default association type for apriori()) with a minimum support of 6% and a minimum confidence of 0.9, matching the code below. The summary() function gives an overview of the association rules found using this function.

# Mine association rules with minimum support 6% and minimum confidence 0.9;
# minlen = 1 allows rules with a single-item LHS.
mining.params <- list(support = 0.06, confidence = 0.9, minlen = 1)
rules <- apriori(crime.lr, parameter = mining.params)
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.9    0.1    1 none FALSE            TRUE       5    0.06      1
 maxlen target   ext
     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 308 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[112 item(s), 5136 transaction(s)] done [0.00s].
sorting and recoding items ... [15 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
writing ... [11 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
# Overview of the mined rule set: rule-length distribution and the
# distribution of the quality measures (support, confidence, lift, count).
summary(rules)
set of 11 rules

rule length distribution (lhs + rhs):sizes
 2 
11 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      2       2       2       2       2       2 

summary of quality measures:
    support          confidence      lift           count       
 Min.   :0.06094   Min.   :1    Min.   :1.733   Min.   : 313.0  
 1st Qu.:0.06532   1st Qu.:1    1st Qu.:1.733   1st Qu.: 335.5  
 Median :0.09404   Median :1    Median :1.733   Median : 483.0  
 Mean   :0.10141   Mean   :1    Mean   :2.454   Mean   : 520.8  
 3rd Qu.:0.10514   3rd Qu.:1    3rd Qu.:2.545   3rd Qu.: 540.0  
 Max.   :0.22118   Max.   :1    Max.   :7.234   Max.   :1136.0  

mining info:
     data ntransactions support confidence
 crime.lr          5136    0.06        0.9
# Sort rules by lift, take the top 20 (only 11 exist here), and render the
# inspected rules as a formatted table.
kable(inspect(head(sort(rules, by ="lift"),20)),digits=4)
     lhs                                        rhs                             support confidence     lift count
[1]  {Sexual.Contact}                        => {Broad.Alleg...Sexual.Abuse} 0.10085670          1 7.233803   518
[2]  {Striking.Child.with.Closed.Fist}       => {Broad.Alleg...Abuse}        0.06094237          1 2.545094   313
[3]  {Striking.child..7.18yrs..on.face.head} => {Broad.Alleg...Abuse}        0.10942368          1 2.545094   562
[4]  {Cuts..Bruises..Welts}                  => {Broad.Alleg...Abuse}        0.15342679          1 2.545094   788
[5]  {Medical.Neglect}                       => {Broad.Alleg...Neglect}      0.06736760          1 1.732794   346
[6]  {Educational.Neglect}                   => {Broad.Alleg...Neglect}      0.08450156          1 1.732794   434
[7]  {Inadequate.Food}                       => {Broad.Alleg...Neglect}      0.06327882          1 1.732794   325
[8]  {Failure.to.Protect}                    => {Broad.Alleg...Neglect}      0.06172118          1 1.732794   317
[9]  {Environmental.Neglect}                 => {Broad.Alleg...Neglect}      0.09871495          1 1.732794   507
[10] {Broad.Alleg...Neglect..True.}          => {Broad.Alleg...Neglect}      0.09404206          1 1.732794   483
[11] {Inadequate.Supervision}                => {Broad.Alleg...Neglect}      0.22118380          1 1.732794  1136

3.3.1 Filtering Interesting Association Rules

One way of identifying the interesting association rules is to look at association rules with known “interesting” causes, and study their association strengths or measures of interest, such as, lift, confidence, support etc.

# Compute several interest measures for each mined rule against the
# transaction data.  "chiSquared" is the measure's canonical name in arules
# (the original's "chiSquare" relied on partial matching -- the output
# column is labelled chiSquared).
interestMeasure(rules, c("support", "chiSquared", "confidence", "coverage",
                         "lift"), crime.lr)
      support chiSquared confidence   coverage     lift
1  0.06736760   271.8612          1 0.06736760 1.732794
2  0.08450156   347.3871          1 0.08450156 1.732794
3  0.06327882   254.2463          1 0.06327882 1.732794
4  0.06172118   247.5762          1 0.06172118 1.732794
5  0.06094237   514.9998          1 0.06094237 2.545094
6  0.09871495   412.2184          1 0.09871495 1.732794
7  0.10085670  3591.3184          1 0.10085670 7.233803
8  0.10942368   975.0348          1 0.10942368 2.545094
9  0.09404206   390.6796          1 0.09404206 1.732794
10 0.15342679  1438.1913          1 0.15342679 2.545094
11 0.22118380  1068.8702          1 0.22118380 1.732794

3.4 Visualization

Here we show a few plots to understand these rules better.

3.4.1 Graph-based Visulization

Recall that for a rule \(X \Rightarrow Y\), X is called antecedent and Y is called consequent

This shows the rules (or itemsets) as a graph with items as labeled vertices, and rules (or itemsets) represented as vertices connected to items using arrows. For rules, the LHS items are connected with arrows pointing to the vertex representing the rule and the RHS has an arrow pointing to the item.

# Interactive rule graph: items are labeled vertices; arrows run from the
# LHS items into the rule vertex and from the rule vertex to the RHS item.
plot(rules, method="graph", engine = "htmlwidget")

3.4.2 Scatterplot of the rules

We can also do a basic scatterplot of the rules mined earlier by the apriori() function, with their support on the x-axis and lift on the y-axis, where the shading of the points (rules) increases with their confidence. The interesting rules should have high support, high confidence and high lift.

# Scatterplot of the rules: support (x-axis) vs. lift (y-axis), with point
# shading mapped to confidence.
plot(rules, measure=c("support", "lift"), shading="confidence")

3.4.3 Matrix based Visualizations

The Matrix-based visualization techniques organize the antecedent and consequent itemsets on the x and y-axes, respectively. A selected interest measure is displayed at the intersection of the antecedent and consequent of a given rule. If no rule is available for a antecedent/consequent combination the intersection area is left blank.

Next, we plot the filtered rules using the matrix visualization. The color shading of the squares represents the interest measure, lift. The plot doesn’t display the long labels on the axes but prints out the antecedents and consequents on the terminal. The last argument is needed to create a tidy figure by putting rules with similar lift together.

# Matrix visualization: antecedents on x, consequents on y, cell color = lift.
# reorder = "support/confidence" groups rules with similar measures together.
plot(rules, method="matrix", measure="lift", control=list(reorder="support/confidence"))
Itemsets in Antecedent (LHS)
 [1] "{Inadequate.Supervision}"               
 [2] "{Cuts..Bruises..Welts}"                 
 [3] "{Striking.child..7.18yrs..on.face.head}"
 [4] "{Sexual.Contact}"                       
 [5] "{Environmental.Neglect}"                
 [6] "{Broad.Alleg...Neglect..True.}"         
 [7] "{Educational.Neglect}"                  
 [8] "{Medical.Neglect}"                      
 [9] "{Inadequate.Food}"                      
[10] "{Failure.to.Protect}"                   
[11] "{Striking.Child.with.Closed.Fist}"      
Itemsets in Consequent (RHS)
[1] "{Broad.Alleg...Neglect}"      "{Broad.Alleg...Abuse}"       
[3] "{Broad.Alleg...Sexual.Abuse}"